# vllm_serve.py
from vllm import SamplingParams
from vllm import LLM
import argparse


def setup_vllm_service():
    """Build and return a vLLM inference engine for the fine-tuned model.

    Loads the distilled Qwen3 checkpoint with settings sized for a
    single RTX 4060 GPU (limited VRAM).

    Returns:
        A ``vllm.LLM`` engine ready for offline inference.
    """
    model_path = "./qwen3-distilled-model"
    engine_config = {
        "model": model_path,
        "tokenizer": model_path,
        "max_model_len": 2048,
        # Leave VRAM headroom on the RTX 4060.
        "gpu_memory_utilization": 0.85,
        # Optional quantization; assumes the checkpoint is AWQ-quantized —
        # TODO(review): confirm, otherwise vLLM rejects this setting at load.
        "quantization": "awq",
        "trust_remote_code": True,
    }
    return LLM(**engine_config)


def create_openai_api_server(llm):
    """Placeholder for exposing *llm* through an OpenAI-compatible API.

    No server is started here: vLLM's OpenAI-compatible server is launched
    from the command line, e.g.::

        vllm serve ./qwen3-distilled-model --port 8000 --max-model-len 2048

    Args:
        llm: The loaded ``vllm.LLM`` engine.

    Returns:
        The ``llm`` argument, unchanged.
    """
    # NOTE: a previous version imported vllm.entrypoints.openai.api_server
    # here but never used it; that dead (and potentially failing) import
    # has been removed.
    return llm


if __name__ == "__main__":
    # Script entry: load the fine-tuned model into a vLLM engine.
    engine = setup_vllm_service()
    print("模型加载完成，准备启动服务...")